To open and use this file you need to install the following: - R, RStudio
#install.packages("quanteda")
#install.packages("readtext")
#install.packages("wordcloud")
#install.packages("RColorBrewer")
#install.packages("wordcloud2")
#install.packages("tidyverse")
#install.packages("tm")
#install.packages("quanteda.textmodels")
#install.packages("quanteda.textstats")
#install.packages("quanteda.textplots")
#install.packages("broom")
This step is necessary to get the text data from the websites. In a first step, a list of links of all the subpages of a website is being created. Next, this list of links is processed and the text data is being retrieved and saved in .txt files. This was done in the terminal, not in R.
# run link discovery through website and store the resulting links in a file
$ trafilatura --sitemap "https://www.klimareporter.de" --list > klimareporterlinks.txt
# to process list of links and get texts
$ trafilatura -i klimareporterlinks.txt -o klimareporter_texts
# load libraries
library(quanteda)
library(readtext)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
library(tidyverse)
library(tm)
library("textcat")
library("quanteda.textplots")
library("quanteda.textstats")
library("gsubfn")
library("spacyr")
library(broom)
spacy_initialize(model = "de_core_news_sm")
To open the previously created text files, we need to run the following code:
# you can get the current directory for importing the text files by setting current directory and open relative path from there with
#texts <- readtext("path/*")
#setwd("/Users/anna/Documents/textmining/textmining_climate")
# climate change activists texts
# readtext() reads every file matched by the glob into a data frame
# with one row per document (doc_id + text)
fff_de_texts <- readtext("text_files/pro/fff_de_texts/*")
ikem_texts <- readtext("text_files/pro/ikem_texts/*")
klimarep_texts <- readtext("text_files/pro/klimareporter_texts/*")
klimafakten_texts <- readtext("text_files/pro/klimafakten_texts/*")
zero_texts <- readtext("text_files/pro/germanzero_texts/*")
komma_texts <- readtext("text_files/pro/komma_texts/*")
# climate change sceptics texts
eike_texts <- readtext("text_files/contra/eike_texts/*")
# build corpus for each text collection with an "origin" tag
# specify language for each text to get rid of non-German texts
#
# The original code repeated the same four lines for every website;
# the helper below performs the identical steps once.

# Helper: build a quanteda corpus from a readtext object, tag its origin,
# detect the per-document language with textcat(), and keep German docs only.
build_german_corpus <- function(texts, origin) {
  crp <- corpus(texts)
  docvars(crp, "origin") <- origin
  docvars(crp, "language") <- textcat(crp)
  corpus_subset(crp, language == "german", drop_docid = TRUE)
}

# activists
fff_de_corpus      <- build_german_corpus(fff_de_texts, "fff_de")
ikem_corpus        <- build_german_corpus(ikem_texts, "ikem")
klimarep_corpus    <- build_german_corpus(klimarep_texts, "kr")
klimafakten_corpus <- build_german_corpus(klimafakten_texts, "kf")
zero_corpus        <- build_german_corpus(zero_texts, "zero")
komma_corpus       <- build_german_corpus(komma_texts, "gk")
# sceptics
eike_corpus        <- build_german_corpus(eike_texts, "eike")
# build a PRO corpus for all activists texts
# create a "group" tag with value "activists"
pro_corpus <- fff_de_corpus + ikem_corpus + klimarep_corpus +
  klimafakten_corpus + zero_corpus + komma_corpus
docvars(pro_corpus, "group") <- "activists"
# build a CONTRA corpus for all sceptics texts
# create a "group" tag with value "sceptics"
contra_corpus <- eike_corpus
docvars(contra_corpus, "group") <- "sceptics"
# get random sample corpus for activists
pro2000 <- corpus_sample(pro_corpus, size = 2000)
# get random sample corpus for sceptics
contra2000 <- corpus_sample(contra_corpus, size = 2000)
# get id number for each sampled document
# (moved BEFORE combining so full_corpus also carries the "id" docvar;
#  seq_len() replaces 1:n, which misbehaves when n is 0)
docvars(pro2000, "id") <- as.character(seq_len(ndoc(pro2000)))
docvars(contra2000, "id") <- as.character(seq_len(ndoc(contra2000)))
# create "full" (combined) corpus with pro and contra sample
full_corpus <- pro2000 + contra2000
# save corpus files as .rds file for later use
saveRDS(full_corpus, "corpora/full_corpus.rds")
saveRDS(pro_corpus, "corpora/pro_corpus.rds")
saveRDS(contra_corpus, "corpora/contra_corpus.rds")
saveRDS(pro2000, "corpora/pro2000.rds")
saveRDS(contra2000, "corpora/contra2000.rds")
# activists
saveRDS(fff_de_corpus, "corpora/fff_de_corpus.rds")
saveRDS(ikem_corpus, "corpora/ikem_corpus.rds")
saveRDS(klimarep_corpus, "corpora/klimarep_corpus.rds")
saveRDS(klimafakten_corpus, "corpora/klimafakten_corpus.rds")
saveRDS(zero_corpus, "corpora/zero_corpus.rds")
saveRDS(komma_corpus, "corpora/komma_corpus.rds")
# sceptics
saveRDS(eike_corpus, "corpora/eike_corpus.rds")
# activists
saveRDS(fff_de_corpus, "corpora/fff_de_corpus.rds")
saveRDS(ikem_corpus, "corpora/ikem_corpus.rds")
saveRDS(klimarep_corpus, "corpora/klimarep_corpus.rds")
saveRDS(klimafakten_corpus, "corpora/klimafakten_corpus.rds")
saveRDS(zero_corpus, "corpora/zero_corpus.rds")
saveRDS(komma_corpus, "corpora/komma_corpus.rds")
# sceptics
saveRDS(eike_corpus, "corpora/eike_corpus.rds")
# load corpus files
# (use <- for assignment instead of =, per R convention)
full_corpus <- readRDS("corpora/full_corpus.rds")
pro_corpus <- readRDS("corpora/pro_corpus.rds")
contra_corpus <- readRDS("corpora/contra_corpus.rds")
pro2000 <- readRDS("corpora/pro2000.rds")
contra2000 <- readRDS("corpora/contra2000.rds")
# optional: load the individual per-website corpora as well
fff_de_corpus <- readRDS("corpora/fff_de_corpus.rds")
ikem_corpus <- readRDS("corpora/ikem_corpus.rds")
klimarep_corpus <- readRDS("corpora/klimarep_corpus.rds")
klimafakten_corpus <- readRDS("corpora/klimafakten_corpus.rds")
zero_corpus <- readRDS("corpora/zero_corpus.rds")
komma_corpus <- readRDS("corpora/komma_corpus.rds")
eike_corpus <- readRDS("corpora/eike_corpus.rds")
First, we want to have a look at the information each of the corpora gives us: - types - tokens - number of sentences - origin - language - group - id
# retrieve overview of corpus information
summary(pro2000, n = 10)
Corpus consisting of 2000 documents, showing 10 documents:
Text Types Tokens Sentences origin language group id
ikem_01086.txt 207 551 20 ikem german activists 1
ikem_01158.txt 27 30 2 ikem german activists 2
gerzero_00045.txt 40 44 3 zero german activists 3
ikem_01396.txt 27 30 2 ikem german activists 4
ikem_01532.txt 216 855 28 ikem german activists 5
ikem_01553.txt 169 291 12 ikem german activists 6
ikem_00191.txt 268 462 18 ikem german activists 7
kr_00033.txt 323 575 31 kr german activists 8
ikem_00343.txt 26 35 1 ikem german activists 9
ikem_01502.txt 12 12 1 ikem german activists 10
summary(contra2000, n=10)
Corpus consisting of 2000 documents, showing 10 documents:
Text Types Tokens Sentences origin language group id
eike_10314.txt 2894 11548 545 eike german sceptics 1
eike_06091.txt 452 867 41 eike german sceptics 2
eike_05388.txt 21 26 2 eike german sceptics 3
eike_09888.txt 1728 4534 234 eike german sceptics 4
eike_01977.txt 44 50 3 eike german sceptics 5
eike_06182.txt 46 56 5 eike german sceptics 6
eike_04363.txt 503 1014 56 eike german sceptics 7
eike_08508.txt 20 25 2 eike german sceptics 8
eike_05567.txt 596 1266 58 eike german sceptics 9
eike_03672.txt 18 21 1 eike german sceptics 10
The overview of the corpus information reveals that the sceptics corpus may consist of much longer texts (see “Sentences” counts) than the activists corpus. We want to re-check this information by plotting the sentence counts for a subset of the data.
# retrieve corpus information for the first 50 documents only
# (summary() returns a data frame with Types/Tokens/Sentences per document)
contra2000_sum <- summary(contra2000, n=50)
pro2000_sum <- summary(pro2000, n=50)
# create plots of the per-document sentence counts
# group=1 makes geom_line() connect points across the discrete id axis
ggplot(pro2000_sum, aes(id, Sentences, group=1)) +
geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=0, vjust=1, hjust=1)) +
ggtitle("Sentences Pro2000")
ggplot(contra2000_sum, aes(id, Sentences, group=1)) +
geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=0, vjust=1, hjust=1)) +
ggtitle("Sentences Contra2000")
Now we can calculate the mean sentence count by running the following code.
sents_pro = summary(pro2000, n=ndoc(pro2000))$Sentences
mean(sents_pro)
[1] 24.7015
sents_con = summary(contra2000, n=ndoc(contra2000))$Sentences
mean(sents_con)
[1] 73.8075
The results show that the texts of the activists corpus contain on average 24.7 sentences, while the texts of the sceptics corpus contain on average 73.8 sentences. This suggests that the texts of the sceptics corpus are much longer than those in the other corpus.
Now let’s have a look at the TTR of both corpora.
# retrieve TTR (type-token ratio) for the pro corpus
# (fixed typo: "reove_numbers" -> "remove_numbers"; the misspelled argument
# was silently swallowed by ... and never applied)
ttr_p2000 <- textstat_lexdiv(dfm_p2000, measure = "TTR", remove_numbers = TRUE,
                             remove_punct = TRUE, remove_symbols = TRUE)
# get the mean TTR across all documents
mean(ttr_p2000$TTR)
[1] 0.8426561
# retrieve TTR (type-token ratio) for the contra corpus
# (fixed typo: "reove_numbers" -> "remove_numbers")
ttr_c2000 <- textstat_lexdiv(dfm_c2000, measure = "TTR", remove_numbers = TRUE,
                             remove_punct = TRUE, remove_symbols = TRUE)
# get the mean TTR across all documents
mean(ttr_c2000$TTR)
[1] 0.8463942
The closer the value approximates to 1, the greater the lexical richness. Both corpora appear to have very similar TTRs and thus seem to reveal very similar lexical richness of the texts.
# retrieve stop word lists
de_stopwords <- stopwords::stopwords("de", source = "snowball")
en_stopwords <- stopwords::stopwords("en", source = "snowball")
# read the custom stop word file and extract its single column as a
# character vector (fixed: read.table() returns a data.frame, and
# c(character, data.frame, character) produces a list, not a flat
# character vector, which breaks downstream feature removal)
custom_stopwords <- read.table("de_complete.txt", header = FALSE, sep = "\n")[[1]]
# add own stop words; all pieces are now character vectors, so c() stays flat
full_stopwords <- c(de_stopwords, "dass", "=", "the", "seit", "ab", "beim", "\n", "mal", "c", "|", "m", "kommentare", "neueste", "gepostet", custom_stopwords, en_stopwords)
de_stopwords1 <- c(de_stopwords, "dass", "=", "the", "seit", "ab", "beim", "\n", "mal", "c", "\\|","|", "m", "kommentare", "neueste", "gepostet", "admin", "cookies", "inhalte", "inhalt", "newsletter", "posten", "zugriff", "passwort", "geschützt", "seite", "website", "webseite", "and", "0", "1", "2", "3","4","5","6","7","8","9", "mfg","w","t","wer")
# create dfm for the activists sample
# (tokens-first pipeline; the old dfm(corpus, remove=...) call is deprecated,
# as the warnings captured below show)
dfm_p2000 <- tokens(pro2000, remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
# create dfm for the sceptics sample (tokens-first pipeline; the old
# dfm(corpus, remove=...) call is deprecated, as the warnings below show)
dfm_c2000 <- tokens(contra2000, remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
Let’s have a look at the dfm of both corpora:
# pro corpus dfm
dfm_p2000
Document-feature matrix of: 2,000 documents, 60,294 features (99.72% sparse) and 4 docvars.
features
docs ikem aktuellen eeg-erfahrungsbericht beteiligt webseite bundesministeriums wirtschaft
ikem_01086.txt 7 1 2 2 2 2 3
ikem_01158.txt 1 0 0 0 0 0 0
gerzero_00045.txt 0 0 0 0 0 0 0
ikem_01396.txt 1 0 0 0 0 0 0
ikem_01532.txt 3 0 0 0 0 1 3
ikem_01553.txt 1 0 0 0 0 0 0
features
docs energie energiewende wurde
ikem_01086.txt 3 2 2
ikem_01158.txt 0 0 0
gerzero_00045.txt 0 0 0
ikem_01396.txt 0 0 0
ikem_01532.txt 0 0 1
ikem_01553.txt 2 0 1
[ reached max_ndoc ... 1,994 more documents, reached max_nfeat ... 60,284 more features ]
# contra corpus dfm
dfm_c2000
Document-feature matrix of: 2,000 documents, 146,894 features (99.70% sparse) and 4 docvars.
features
docs modellierung kiehl trenberth wirft zahlreiche fragen gibt deutliche abweichungen
eike_10314.txt 6 7 6 1 1 3 14 1 1
eike_06091.txt 0 0 0 0 0 0 1 0 0
eike_05388.txt 0 0 0 0 0 0 0 0 0
eike_09888.txt 0 0 0 0 1 1 9 0 0
eike_01977.txt 0 0 0 0 0 0 0 0 0
eike_06182.txt 0 0 0 0 0 0 0 0 0
features
docs satelliten
eike_10314.txt 4
eike_06091.txt 0
eike_05388.txt 0
eike_09888.txt 0
eike_01977.txt 0
eike_06182.txt 0
[ reached max_ndoc ... 1,994 more documents, reached max_nfeat ... 146,884 more features ]
The document-feature matrix basically consists of rows for each text and columns for each word in the texts. The column values reflect how many times a term appears in a text - if a term does not occur in a text, its value is zero.
To clean the corpora lemmatization and application of stop lists (see previous step) was performed. For the lemmatization, the spacyr library was used.
Info: This piece of code takes some time to run.
# parse the pro corpus with spacy function and retrieve lemma for each token
sp_pro2000 <- spacy_parse(pro2000, pos=FALSE, entity=FALSE, dependency=FALSE)
Warnung in spacy_parse.character(pro2000, pos = FALSE, entity = FALSE, dependency = FALSE)
lemmatization may not work properly in model 'de_core_news_sm'
# replace each token with its lemma so the dfm below is lemmatized
sp_pro2000$token <- sp_pro2000$lemma
# create lemmatized version of dfm for activists corpus
# (tokens-first pipeline instead of the deprecated dfm(remove=...) arguments)
sp_dfm_p2000 <- as.tokens(sp_pro2000) %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
# parse the contra corpus with spacy function and retrieve lemma for each token
sp_contra2000 <- spacy_parse(contra2000, pos=FALSE, entity=FALSE, dependency=FALSE)
Warnung in spacy_parse.character(contra2000, pos = FALSE, entity = FALSE,
lemmatization may not work properly in model 'de_core_news_sm'
# replace each token with its lemma so the dfm below is lemmatized
sp_contra2000$token <- sp_contra2000$lemma
# create lemmatized version of dfm for sceptics corpus
# (tokens-first pipeline instead of the deprecated dfm(remove=...) arguments)
sp_dfm_c2000 <- as.tokens(sp_contra2000) %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
# parse the full corpus with spacy function and retrieve lemma for each token
sp_full <- spacy_parse(full_corpus, pos=FALSE, entity=FALSE, dependency=FALSE)
Warnung in spacy_parse.character(full_corpus, pos = FALSE, entity = FALSE,
lemmatization may not work properly in model 'de_core_news_sm'
# replace each token with its lemma so the dfm below is lemmatized
sp_full$token <- sp_full$lemma
# create lemmatized version of dfm for the FULL (combined) corpus
# (fixed comment: the original said "sceptics corpus";
#  tokens-first pipeline instead of the deprecated dfm(remove=...) arguments)
sp_dfm_full <- as.tokens(sp_full) %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
Comment: The German lemmatization with spacyr is not very accurate. A lot of compounds do not get lemmatized at all and therefore appear multiple times (in all possible forms) in the dfm.
With the help of `topfeatures` we can check the most frequently occurring terms for each dfm.
# check top 50 terms for activists corpus
topfeatures(sp_dfm_p2000, n=50)
mehr uhr ikem mensch jahr weit
4284 3497 2655 2463 2062 1698
deutschland geben energie thema aktuell sollen
1643 1603 1592 1401 1362 1322
groß gehen klimaschutz gut sowie immer
1317 1301 1289 1276 1209 1156
bleiben information müssen schon dabei future
1151 1150 1144 1084 1019 992
arbeit stehen energiewende politik finden welch
976 925 900 895 891 886
land projekt ziel wichtig berlin kommen
876 870 862 852 846 838
fridays erfahren jed zukunft klimakrise newsletter
831 825 815 813 796 779
ganz neu rahmen laufende möglich erst
769 750 747 740 735 721
anmelden frage
719 717
# check top 50 terms for sceptics corpus
topfeatures(sp_dfm_c2000, n=50)
jahr geben mehr schon immer gut kommen co2 weit
8926 6744 6044 4549 4348 3955 3880 3839 3715
ja gehen deutschland sollen mensch welch groß sagen jed
3702 3700 3597 3490 3427 3325 3323 3265 3177
energie hoch wenig zeigen ganz global temperatur ° müssen
3012 2869 2864 2776 2684 2652 2587 2568 2393
sehen herr einfach natürlich klima zeit strom stehen heute
2370 2249 2217 2207 2207 2149 2130 2101 2073
finden frage erwärmung erst klimawandel etwa land liegen welt
2042 2020 1979 1974 1951 1949 1901 1900 1863
wer genau wissen atmosphäre tun
1835 1811 1764 1754 1754
Since those lists are not easy to handle, we will create a plot of the information in the following sections.
To plot the most frequent words of each corpus, let’s run the following code:
# get the top 50 word frequencies from each lemmatized dfm
freq_p2000 <- textstat_frequency(sp_dfm_p2000, n=50)
freq_c2000 <- textstat_frequency(sp_dfm_c2000, n=50)
# order features by descending frequency so the plots are sorted
# (fixed: the original assigned to the non-existent objects
#  plot_p2000/plot_c2000 instead of freq_p2000/freq_c2000)
freq_p2000$feature <- with(freq_p2000, reorder(feature, -frequency))
freq_c2000$feature <- with(freq_c2000, reorder(feature, -frequency))
# create plot for activists corpus word frequencies
plot1 <- ggplot(freq_p2000, aes(x=feature, y=frequency)) +
geom_point()+ggtitle("P2000 Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
plot1
# create plot for sceptics corpus word frequencies
plot2 <- ggplot(freq_c2000, aes(x=feature, y=frequency)) +
geom_point()+ ggtitle("C2000 Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
plot2
This already gives us a first impression of the content of the corpora texts.
Nevertheless, we are particularly interested in the word frequencies of German climate change compound nouns. Accordingly, we start with the retrieval of terms starting with “klima” to hopefully get some climate change compounds. In the next step, we again retrieve the frequencies of the words and create plots for both corpora.
# subset each dfm to features starting with "klima..." (glob pattern)
klima_p2000 <- dfm_select(sp_dfm_p2000, pattern="klima*")
klima_c2000 <- dfm_select(sp_dfm_c2000, pattern="klima*")
# retrieve frequencies for the top 50 "klima" words
freq_klima_p2000 <- textstat_frequency(klima_p2000, n=50)
freq_klima_c2000 <- textstat_frequency(klima_c2000, n=50)
# order features by descending frequency so the plots are sorted
freq_klima_p2000$feature <- with(freq_klima_p2000, reorder(feature, -frequency))
freq_klima_c2000$feature <- with(freq_klima_c2000, reorder(feature, -frequency))
# create plot for activists corpus "klima" word frequencies
plot3 <- ggplot(freq_klima_p2000, aes(x=feature, y=frequency)) +
geom_point()+ggtitle("P2000 'Klima' Word Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
plot3
# create plot for sceptics corpus "klima" word frequencies
plot4 <- ggplot(freq_klima_c2000, aes(x=feature, y=frequency)) +
geom_point()+ ggtitle("C2000 'Klima' Word Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
plot4
In the following step we retrieve lists of the terms which only appear in one of the Top50 lists. For activists group:
# get list of climate change compounds only appearing in top50 for activists
setdiff(freq_klima_p2000$feature, freq_klima_c2000$feature)
[1] "klimapolitische" "klimagerechtigkeit"
[3] "klimastreik" "klimaneutralität"
[5] "klimafinanzierung" "klimaschutzmaßnahmen"
[7] "klimareporter" "klimaschutzgesetz"
[9] "klimaschutzziele" "klimanotstand"
[11] "klimacamp" "klimapaket"
[13] "klimagerechte" "klimakommunikation"
[15] "klimaschädlichen" "klimafreundliche"
[17] "klimafakten.de" "klimaschädliche"
[19] "klimabewegung" "klimaneutrale"
[21] "klimagerechtigkeitsbewegung" "klimaplan"
[23] "klimakonferenzen" "klimastreiks"
[25] "klimawissen" "klimaschutzpolitik"
[27] "klimaabkommens" "klimapolitischen"
[29] "klimawahl" "klimazielen"
[31] "klimaneutralen" "klimaziel"
[33] "klimafreundlichen" "klimagesetz"
For sceptics group:
# get list of climate change compounds only appearing in top50 for activists
setdiff(freq_klima_c2000$feature, freq_klima_p2000$feature)
[1] "klimawissenschaft" "klimamodelle" "klimawissenschaftler"
[4] "klimasensitivität" "klimamodellen" "klimaleugner"
[7] "klimaretter" "klimatologie" "klimaänderungen"
[10] "klimaalarmisten" "klimaskeptiker" "klimarettung"
[13] "klimafragen" "klimatisch" "klimasystem"
[16] "klimahysterie" "klimakirche" "klimaentwicklung"
[19] "klima-alarmisten" "klimaretter.info" "klimagipfel"
[22] "klimaaktivisten" "klimadebatte" "klimatologen"
[25] "klimamodell" "klimaexperten" "klimafolgenforschung"
[28] "klimaveränderungen" "klimaschau" "klimareligion"
[31] "klima-alarmismus" "klimaschutzplan" "klimavertrag"
[34] "klimahysteriker"
To get weighted frequencies of the corpora, it is necessary to have a look at the tf-idf (term frequency–inverse document frequency) of the words.
# weighted words: convert raw counts to within-document proportions
p2000_weight <- dfm_weight(sp_dfm_p2000, scheme="prop")
c2000_weight <- dfm_weight(sp_dfm_c2000, scheme="prop")
# top 50 relative frequencies per corpus
relfreq_p2000 <- textstat_frequency(p2000_weight, n=50)
relfreq_c2000 <- textstat_frequency(c2000_weight, n=50)
# tf-idf weighting of the full lemmatized dfms
# NOTE(review): these two objects are not used in this section and are
# recomputed (overwritten) further below before first use
p2000_tfidf <- dfm_tfidf(sp_dfm_p2000, scheme_tf = "prop", scheme_df = "inverse")
c2000_tfidf <- dfm_tfidf(sp_dfm_c2000, scheme_tf = "prop", scheme_df = "inverse")
#plot3 <- with(relfreq_p2000, reorder(feature, -freqency))
# order features by descending relative frequency for a sorted plot
relfreq_p2000$feature <- with(relfreq_p2000, reorder(feature, -frequency))
plot3 <- ggplot(relfreq_p2000, aes(x=feature, y=frequency)) +
geom_point()+ggtitle("P2000 Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
#ggsave(plot=plot1, width = 10, height = 5, dpi=300, filename="klima_eike_plot.jpeg" )
plot3
# weighted words: within-document proportions for the "klima" subsets
p2000_klima_weight <- dfm_weight(klima_p2000, scheme="prop")
c2000_klima_weight <- dfm_weight(klima_c2000, scheme="prop")
# top 50 relative frequencies of "klima" words per corpus
# NOTE(review): this overwrites the relfreq_* objects from the previous section
relfreq_p2000 <- textstat_frequency(p2000_klima_weight, n=50)
relfreq_c2000 <- textstat_frequency(c2000_klima_weight, n=50)
# tf-idf on the full lemmatized dfms
# NOTE(review): unused — overwritten below with the "klima" subsets before use
p2000_tfidf <- dfm_tfidf(sp_dfm_p2000, scheme_tf = "prop", scheme_df = "inverse")
c2000_tfidf <- dfm_tfidf(sp_dfm_c2000, scheme_tf = "prop", scheme_df = "inverse")
# order features by descending relative frequency for sorted plots
relfreq_p2000$feature <- with(relfreq_p2000, reorder(feature, -frequency))
plot7 <- ggplot(relfreq_p2000, aes(x=feature, y=frequency)) +
geom_point()+ggtitle("P2000 Klima Words - Relative Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
plot7
relfreq_c2000$feature <- with(relfreq_c2000, reorder(feature, -frequency))
plot8 <- ggplot(relfreq_c2000, aes(x=feature, y=frequency)) +
geom_point()+ggtitle("C2000 Klima Words - Relative Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
plot8
# tf-idf restricted to the "klima" features (these are the versions plotted)
p2000_tfidf <- dfm_tfidf(klima_p2000, scheme_tf = "prop", scheme_df = "inverse")
c2000_tfidf <- dfm_tfidf(klima_c2000, scheme_tf = "prop", scheme_df = "inverse")
# force=TRUE is required because textstat_frequency() refuses weighted dfms otherwise
pro_freq_tfidf <- p2000_tfidf %>%
textstat_frequency(n=20, force=TRUE)
con_freq_tfidf <- c2000_tfidf %>%
textstat_frequency(n=20, force=TRUE)
# dot plot of the top-20 tf-idf "klima" terms, highest at the top
# (x is built from row numbers, then relabeled with the feature names)
tplot_tfidf_p2000 <- ggplot(data=pro_freq_tfidf,
aes(x=factor(nrow(pro_freq_tfidf):1),
y=frequency)) +
geom_point() +
ggtitle("P2000 'Klima' Words - Relative Frequencies")+
coord_flip() +
scale_x_discrete(breaks=factor(nrow(pro_freq_tfidf):1),
labels=pro_freq_tfidf$feature) +
labs(x=NULL, y="tf-idf")
tplot_tfidf_p2000
tplot_tfidf_c2000 <- ggplot(data=con_freq_tfidf,
aes(x=factor(nrow(con_freq_tfidf):1),
y=frequency)) +
geom_point() +
ggtitle("C2000 'Klima' Words - Relative Frequencies")+
coord_flip() +
scale_x_discrete(breaks=factor(nrow(con_freq_tfidf):1),
labels=con_freq_tfidf$feature) +
labs(x=NULL, y="tf-idf")
tplot_tfidf_c2000
Additionally, let’s have a look at the climate change terms for each website we created our corpora from. All except from EIKE are used to construct the activists corpus.
The following plot provides a lot of information about the usage of terms between both groups - activists and sceptics.
# relative frequencies for ALL "klima" terms per group (no top-n cutoff)
freqs_pro <- textstat_frequency(p2000_klima_weight)
freqs_con <- textstat_frequency(c2000_klima_weight)
# plotting: keep only feature + frequency columns
# (the original wrapped each object in a no-op filter() with no condition;
#  that call was dropped)
freqs.act <- as.data.frame(freqs_pro) %>% select(feature, frequency)
freqs.scept <- as.data.frame(freqs_con) %>% select(feature, frequency)
# join both groups on the feature, keep the 30 most frequent activist terms,
# and fix the factor order so the dumbbell plot is sorted
freqs <- left_join(freqs.act, freqs.scept, by = "feature") %>%
  head(30) %>%
  arrange(frequency.x) %>%
  mutate(feature = factor(feature, feature))
# dumbbell plot: grey segment connects the two group frequencies per term
p <- ggplot(freqs) +
  geom_segment(aes(x=feature, xend=feature, y=frequency.x, yend=frequency.y), color="grey") +
  geom_point(aes(x=feature, y=frequency.x, colour="Activists"), size = 3) +
  geom_point(aes(x=feature, y=frequency.y, colour="Sceptics"), size = 3) +
  ggtitle("Comparison 'Klima' Word Frequencies per Group") +
  xlab("") + ylab("Frequency") +
  coord_flip()
p + labs(colour="Group")
Warnung: Removed 1 rows containing missing values (geom_segment).
Warnung: Removed 1 rows containing missing values (geom_point).
ggsave("comparison_klima_freqs.png", dpi=300, dev='png', height=8, width=12, units="in")
Warnung: Removed 1 rows containing missing values (geom_segment).
Warnung: Removed 1 rows containing missing values (geom_point).
# NOTE(review): p2000_colls and c2000_colls are never defined anywhere in this
# document — presumably they are textstat_collocations() results computed in a
# step that was not included here. Confirm and add the missing computation.
p2000_colls
c2000_colls
# order collocations by descending count for a sorted plot
p2000_colls$collocation <- with(p2000_colls, reorder(collocation, -count))
plot9 <- ggplot(p2000_colls, aes(x=collocation, y=count)) +
geom_point()+ggtitle("P2000 Collocation Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
plot9
ggsave(plot=plot9, width = 10, height = 5, dpi=300, filename="p2000_colls.jpeg")
c2000_colls$collocation <- with(c2000_colls, reorder(collocation, -count))
plot10 <- ggplot(c2000_colls, aes(x=collocation, y=count)) +
geom_point()+ggtitle("C2000 Collocation Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
plot10
ggsave(plot=plot10, width = 10, height = 5, dpi=300, filename="c2000_colls.jpeg")
# Create a dfm grouped by group (activists vs. sceptics)
# (fixed leftover tutorial comments that referred to "president"/"Trump")
group_dfm <- tokens(full_corpus, remove_punct = TRUE) %>%
  tokens_remove(de_stopwords1) %>%
  tokens_group(groups = group) %>%
  dfm()
# keep only "klima*" features
group_dfm_klima <- dfm_select(group_dfm, pattern ="klima*")
# Calculate keyness with the activists group as target
# (fixed: the original referenced the non-existent object corp_dfm_klima)
result_keyness <- textstat_keyness(group_dfm_klima, target = "activists")
# Plot estimated word keyness
textplot_keyness(result_keyness, margin=0.2, n=15, color=c("lightblue", "red"))
# keyword-in-context for "klimaschutz" in the activists sample
# (tokenize first; kwic() on a corpus is deprecated, see warning below)
kwic_pro <- kwic(tokens(pro2000), pattern="klimaschutz", window=5) %>%
  as_tibble()
Warnung: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
head(kwic_pro, n=20)
#write.csv(kwic_pro, "kwic_pro_klimaschutz.csv")
# keyword-in-context for "klimaschutz" in the sceptics sample
# (tokenize first; kwic() on a corpus is deprecated, see warning below)
kwic_con <- kwic(tokens(contra2000), pattern="klimaschutz", window=5) %>%
  as_tibble()
Warnung: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
head(kwic_con, n=20)
# keyword-in-context for "klimakrise" in the sceptics sample
# (tokenize first; kwic() on a corpus is deprecated, see warning below)
kwic_con <- kwic(tokens(contra2000), pattern="klimakrise", window=5) %>%
  as_tibble()
Warnung: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
head(kwic_con, n=20)
# keyword-in-context for "klimahysterie" in the sceptics sample
# (tokenize first; kwic() on a corpus is deprecated, see warning below)
kwic_con <- kwic(tokens(contra2000), pattern="klimahysterie", window=5) %>%
  as_tibble()
Warnung: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
head(kwic_con, n=20)
write.csv(head(kwic_con, n=20), "kwic_con_klimahysterie.csv")
# keyword-in-context for "klimakrise" in the activists sample
# (tokenize first; kwic() on a corpus is deprecated, see warning below)
kwic_pro <- kwic(tokens(pro2000), pattern="klimakrise", window=5) %>%
  as_tibble()
Warnung: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
head(kwic_pro, n=20)
write.csv(head(kwic_pro, n=20), "kwic_pro_klimakrise.csv")